We will use the USArrests data set, which is part of base R.
# Load the ISLR package (companion to "An Introduction to Statistical Learning").
# The USArrests data itself ships with base R's datasets package.
library(ISLR)
# Row names of USArrests are the 50 US state names.
states <- row.names(USArrests)
names(USArrests)## [1] "Murder" "Assault" "UrbanPop" "Rape"
# Column means: the four variables live on very different scales.
apply(USArrests, 2, mean)## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
# Column variances: Assault dominates by far, which is why we
# standardize the variables before running PCA below.
apply(USArrests, 2, var) ## Murder Assault UrbanPop Rape
## 18.97047 6945.16571 209.51878 87.72916
There are several functions to conduct a PCA analysis (prcomp(), princomp(), etc.). We standardize each variable using the option scale=TRUE.
# PCA on the standardized variables: scale=TRUE divides each variable
# by its standard deviation (centering happens by default).
pr.out <- prcomp(USArrests, scale=TRUE)
# components of the returned prcomp object
names(pr.out) ## [1] "sdev" "rotation" "center" "scale" "x"
The center and scale components correspond to the means and standard deviations of the variables that were used for scaling prior to implementing PCA.
pr.out$center## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
pr.out$scale## Murder Assault UrbanPop Rape
## 4.355510 83.337661 14.474763 9.366385
The rotation matrix provides the principal component loadings; each column of pr.out$rotation contains the corresponding principal component loading vector
pr.out$rotation## PC1 PC2 PC3 PC4
## Murder -0.5358995 0.4181809 -0.3412327 0.64922780
## Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
## Rape -0.5434321 -0.1673186 0.8177779 0.08902432
Principal component score vectors:
dim(pr.out$x) ## [1] 50 4
We can plot the first two principal components as follows
biplot(pr.out, scale=0, cex=0.6)
The scale=0 argument to biplot() ensures that the arrows are scaled to represent the loadings; other values for scale give slightly different biplots with different interpretations.
Notice that this figure is a mirror image of Figure 10.1. Recall that the principal components are only unique up to a sign change, so we can reproduce Figure 10.1 by making a few small changes
# Flip the signs of the loadings and of the scores to match Figure 10.1;
# principal components are unique only up to a sign change.
pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
biplot(pr.out, scale=0, cex=0.6)
Importance of principal components:
summary(pr.out)## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.5749 0.9949 0.59713 0.41645
## Proportion of Variance 0.6201 0.2474 0.08914 0.04336
## Cumulative Proportion 0.6201 0.8675 0.95664 1.00000
We see that the first principal component explains 62.0% of the variance in the data, the next principal component explains 24.7% of the variance, and so forth.
# Scree plot of the component variances.
# (Fix: the original line fused two statements together, which is not valid R.)
screeplot(pr.out, type="lines")
# standard deviations of the principal components
pr.out$sdev## [1] 1.5748783 0.9948694 0.5971291 0.4164494
# variance explained by each component = squared standard deviation
pr.var <- pr.out$sdev^2
pr.var## [1] 2.4802416 0.9897652 0.3565632 0.1734301
# proportion of variance explained (PVE) by each component
pve <- pr.var/sum(pr.var)
pve## [1] 0.62006039 0.24744129 0.08914080 0.04335752
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
This data set contains a set of body measurements along with age, height, weight and gender on 507 individuals (247 men and 260 women). We expect some of these variables to be highly correlated.
# body data from
# Journal of Statistics Education Volume 11, Number 2 (2003),
# www.amstat.org/publications/jse/v11n2/datasets.heinz.html
load("../Data/body.RData")
bodydata <- subset(body, select = -c(age, gender, gender_dummy))
str(bodydata)## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 507 obs. of 23 variables:
## $ biacromial_diameter : num 42.9 43.7 40.1 44.3 42.5 43.3 43.5 44.4 43.5 42 ...
## $ biiliac_diameter : num 26 28.5 28.2 29.9 29.9 27 30 29.8 26.5 28 ...
## $ bitrochanteric_diameter: num 31.5 33.5 33.3 34 34 31.5 34 33.2 32.1 34 ...
## $ chest_depth : num 17.7 16.9 20.9 18.4 21.5 19.6 21.9 21.8 15.5 22.5 ...
## $ chest_diameter : num 28 30.8 31.7 28.2 29.4 31.3 31.7 28.8 27.5 28 ...
## $ elbow_diameter : num 13.1 14 13.9 13.9 15.2 14 16.1 15.1 14.1 15.6 ...
## $ wrist_diameter : num 10.4 11.8 10.9 11.2 11.6 11.5 12.5 11.9 11.2 12 ...
## $ knee_diameter : num 18.8 20.6 19.7 20.9 20.7 18.8 20.8 21 18.9 21.1 ...
## $ ankle_diameter : num 14.1 15.1 14.1 15 14.9 13.9 15.6 14.6 13.2 15 ...
## $ shoulder_girth : num 106 110 115 104 108 ...
## $ chest_girth : num 89.5 97 97.5 97 97.5 ...
## $ waist_girth : num 71.5 79 83.2 77.8 80 82.5 82 76.8 68.5 77.5 ...
## $ abdominal_girth : num 74.5 86.5 82.9 78.8 82.5 80.1 84 80.5 69 81.5 ...
## $ hip_girth : num 93.5 94.8 95 94 98.5 95.3 101 98 89.5 99.8 ...
## $ thigh_girth : num 51.5 51.5 57.3 53 55.4 57.5 60.9 56 50 59.8 ...
## $ bicep_girth : num 32.5 34.4 33.4 31 32 33 42.4 34.1 33 36.5 ...
## $ forearm_girth : num 26 28 28.8 26.2 28.4 28 32.3 28 26 29.2 ...
## $ knee_girth : num 34.5 36.5 37 37 37.7 36.6 40.1 39.2 35.5 38.3 ...
## $ calf_girth : num 36.5 37.5 37.3 34.8 38.6 36.1 40.3 36.7 35 38.6 ...
## $ ankle_girth : num 23.5 24.5 21.9 23 24.4 23.5 23.6 22.5 22 22.2 ...
## $ wrist_girth : num 16.5 17 16.9 16.6 18 16.9 18.8 18 16.5 16.9 ...
## $ weight : num 65.6 71.8 80.7 72.6 78.8 74.8 86.4 78.4 62 81.6 ...
## $ height : num 174 175 194 186 187 ...
Inspecting the correlation matrix we see that these features are in fact highly correlated.
library(ggcorrplot)
cormat <- round(cor(bodydata), 2)
ggcorrplot(cormat, hc.order = TRUE, type = "lower", outline.color = "white")Find the principal components:
pr.out <- prcomp(bodydata, scale=TRUE)
summary(pr.out) ## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.8644 1.5758 1.03211 0.95475 0.69342 0.65851 0.58162
## Proportion of Variance 0.6493 0.1080 0.04632 0.03963 0.02091 0.01885 0.01471
## Cumulative Proportion 0.6493 0.7572 0.80357 0.84320 0.86410 0.88296 0.89767
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.5674 0.52727 0.52186 0.49794 0.45082 0.42002 0.39670
## Proportion of Variance 0.0140 0.01209 0.01184 0.01078 0.00884 0.00767 0.00684
## Cumulative Proportion 0.9117 0.92375 0.93559 0.94637 0.95521 0.96288 0.96972
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.38094 0.3490 0.32002 0.29400 0.2835 0.23619 0.21552
## Proportion of Variance 0.00631 0.0053 0.00445 0.00376 0.0035 0.00243 0.00202
## Cumulative Proportion 0.97603 0.9813 0.98578 0.98954 0.9930 0.99546 0.99748
## PC22 PC23
## Standard deviation 0.19313 0.1437
## Proportion of Variance 0.00162 0.0009
## Cumulative Proportion 0.99910 1.0000
# change the signs of factor loadings and scores
# (principal components are unique only up to a sign flip)
pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
# Biplot of the first two principal components.
# (Fix: the original line fused the biplot call and the pr.var assignment.)
biplot(pr.out, scale=0, cex=0.6)
# proportion of variance explained by each component
pr.var <- pr.out$sdev^2
pve <- pr.var/sum(pr.var)
pve## [1] 0.6492846086 0.1079652790 0.0463153491 0.0396329172 0.0209057316
## [6] 0.0188539644 0.0147079693 0.0139999780 0.0120875732 0.0118406532
## [11] 0.0107802578 0.0088363210 0.0076704395 0.0068423027 0.0063093336
## [16] 0.0052965553 0.0044528583 0.0037581598 0.0034951865 0.0024254119
## [21] 0.0020194249 0.0016217326 0.0008979928
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
Approximately 85% of the total variation can be explained by the first 5 principal components.
# load the data set
load("../Data/lifedata2015.RData")
# exclude the first column and send it to prcomp() function
# (the first column is the non-numeric province identifier; prcomp()
# requires numeric input)
pr_happiness <- prcomp(lifedata2015[,-1], scale=TRUE)
summary(pr_happiness)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6487 2.7994 1.61778 1.44839 1.31128 1.23677 1.08694
## Proportion of Variance 0.3247 0.1911 0.06383 0.05117 0.04194 0.03731 0.02882
## Cumulative Proportion 0.3247 0.5158 0.57968 0.63084 0.67278 0.71009 0.73890
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.05930 0.99654 0.94201 0.9034 0.84682 0.78979 0.76944
## Proportion of Variance 0.02737 0.02422 0.02164 0.0199 0.01749 0.01521 0.01444
## Cumulative Proportion 0.76627 0.79049 0.81214 0.8320 0.84953 0.86474 0.87918
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.76204 0.7189 0.6500 0.63497 0.59349 0.55320 0.51029
## Proportion of Variance 0.01416 0.0126 0.0103 0.00983 0.00859 0.00746 0.00635
## Cumulative Proportion 0.89335 0.9060 0.9163 0.92609 0.93468 0.94215 0.94850
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.4958 0.48471 0.45005 0.43610 0.40353 0.39223 0.36695
## Proportion of Variance 0.0060 0.00573 0.00494 0.00464 0.00397 0.00375 0.00328
## Cumulative Proportion 0.9545 0.96022 0.96516 0.96980 0.97377 0.97752 0.98081
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.33688 0.33084 0.30826 0.2862 0.27626 0.26303 0.24261
## Proportion of Variance 0.00277 0.00267 0.00232 0.0020 0.00186 0.00169 0.00144
## Cumulative Proportion 0.98358 0.98625 0.98856 0.9906 0.99242 0.99411 0.99555
## PC36 PC37 PC38 PC39 PC40 PC41
## Standard deviation 0.22405 0.1925 0.17796 0.16091 0.15358 0.11901
## Proportion of Variance 0.00122 0.0009 0.00077 0.00063 0.00058 0.00035
## Cumulative Proportion 0.99677 0.9977 0.99845 0.99908 0.99965 1.00000
# change the signs of factor loadings and scores
# (components are unique only up to a sign change)
# and plot the biplot
pr_happiness$rotation <- -pr_happiness$rotation
pr_happiness$x <- -pr_happiness$x
# biplot of the first two principal components
biplot(pr_happiness, scale=0, cex=0.6)# compute proportion of variance explained
# variance of each component = squared standard deviation
pr.var <- pr_happiness$sdev^2
pve <- pr.var/sum(pr.var)
pve## [1] 0.3247032515 0.1911377271 0.0638347339 0.0511669356 0.0419376566
## [6] 0.0373075532 0.0288154928 0.0273685063 0.0242215157 0.0216434975
## [11] 0.0199036321 0.0174903839 0.0152139590 0.0144399072 0.0141635465
## [16] 0.0126038973 0.0103042710 0.0098339512 0.0085909019 0.0074641692
## [21] 0.0063510649 0.0059953400 0.0057303036 0.0049400620 0.0046386738
## [26] 0.0039715287 0.0037522269 0.0032841890 0.0027679669 0.0026696605
## [31] 0.0023177290 0.0019980467 0.0018614512 0.0016873924 0.0014356089
## [36] 0.0012243745 0.0009042744 0.0007723912 0.0006314968 0.0005752529
## [41] 0.0003454763
# plot PVE
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')# cumulative proportion of variance explained
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
Approximately 80% of the variation in the data can be explained by the first 10 principal components.
An alternative package:
library(factoextra)
# PCA using FactoMineR
library(FactoMineR)
res_pca_lifedata <- PCA(lifedata2015[,-1], graph = FALSE)
get_eig(res_pca_lifedata)## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 13.31283331 32.47032515 32.47033
## Dim.2 7.83664681 19.11377271 51.58410
## Dim.3 2.61722409 6.38347339 57.96757
## Dim.4 2.09784436 5.11669356 63.08426
## Dim.5 1.71944392 4.19376566 67.27803
## Dim.6 1.52960968 3.73075532 71.00879
## Dim.7 1.18143520 2.88154928 73.89034
## Dim.8 1.12210876 2.73685063 76.62719
## Dim.9 0.99308215 2.42215157 79.04934
## Dim.10 0.88738340 2.16434975 81.21369
## Dim.11 0.81604892 1.99036321 83.20405
## Dim.12 0.71710574 1.74903839 84.95309
## Dim.13 0.62377232 1.52139590 86.47448
## Dim.14 0.59203619 1.44399072 87.91848
## Dim.15 0.58070541 1.41635465 89.33483
## Dim.16 0.51675979 1.26038973 90.59522
## Dim.17 0.42247511 1.03042710 91.62565
## Dim.18 0.40319200 0.98339512 92.60904
## Dim.19 0.35222698 0.85909019 93.46813
## Dim.20 0.30603094 0.74641692 94.21455
## Dim.21 0.26039366 0.63510649 94.84966
## Dim.22 0.24580894 0.59953400 95.44919
## Dim.23 0.23494245 0.57303036 96.02222
## Dim.24 0.20254254 0.49400620 96.51623
## Dim.25 0.19018562 0.46386738 96.98009
## Dim.26 0.16283268 0.39715287 97.37725
## Dim.27 0.15384130 0.37522269 97.75247
## Dim.28 0.13465175 0.32841890 98.08089
## Dim.29 0.11348664 0.27679669 98.35768
## Dim.30 0.10945608 0.26696605 98.62465
## Dim.31 0.09502689 0.23177290 98.85642
## Dim.32 0.08191992 0.19980467 99.05623
## Dim.33 0.07631950 0.18614512 99.24237
## Dim.34 0.06918309 0.16873924 99.41111
## Dim.35 0.05885996 0.14356089 99.55467
## Dim.36 0.05019935 0.12243745 99.67711
## Dim.37 0.03707525 0.09042744 99.76754
## Dim.38 0.03166804 0.07723912 99.84478
## Dim.39 0.02589137 0.06314968 99.90793
## Dim.40 0.02358537 0.05752529 99.96545
## Dim.41 0.01416453 0.03454763 100.00000
# Scree plot
fviz_screeplot(res_pca_lifedata, addlabels = TRUE)
Let’s apply the PCR approach to the regional happiness data. We want to predict provincial happiness levels using the first few principal components.
We can use pcr() function in the {pls} package for that purpose.
library(pls)
# fix the RNG seed so the cross-validation is reproducible
set.seed(123)
# principal components regression of happiness_level on all other
# variables except the province identifier
pcr_fit <- pcr(happiness_level ~ . -province,
data = lifedata2015,
scale=TRUE, # standardize the predictors
validation="LOO") # use leave-one-out cross-validation
summary(pcr_fit)## Data: X dimension: 81 40
## Y dimension: 81 1
## Fit method: svdpc
## Number of components considered: 40
##
## VALIDATION: RMSEP
## Cross-validated using 81 leave-one-out segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 7.58 7.590 6.696 6.850 6.363 6.042 5.698
## adjCV 7.58 7.589 6.694 6.849 6.361 6.013 5.691
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 5.744 5.565 5.438 5.108 5.056 5.191 5.114
## adjCV 5.744 5.557 5.431 5.090 5.047 5.185 5.102
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 5.148 5.425 5.550 5.693 5.341 4.828 4.835
## adjCV 5.137 5.420 5.542 5.691 5.286 4.815 4.825
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 4.857 4.903 4.981 5.068 5.231 5.398 5.404
## adjCV 4.847 4.897 4.970 5.058 5.220 5.387 5.392
## 28 comps 29 comps 30 comps 31 comps 32 comps 33 comps 34 comps
## CV 5.443 5.588 5.636 5.613 5.804 5.994 6.185
## adjCV 5.429 5.576 5.620 5.598 5.788 5.977 6.167
## 35 comps 36 comps 37 comps 38 comps 39 comps 40 comps
## CV 6.496 6.618 6.851 6.501 6.848 7.309
## adjCV 6.476 6.597 6.837 6.473 6.824 7.282
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 33.217 52.13 58.61 63.06 67.28 71.10 73.98
## happiness_level 2.339 26.62 27.94 44.79 55.72 56.07 57.85
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## X 76.73 79.16 81.31 83.35 85.13 86.65
## happiness_level 60.82 63.14 66.85 67.15 67.50 68.60
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## X 88.12 89.57 90.80 91.83 92.74 93.62
## happiness_level 69.02 69.44 71.35 71.63 77.56 77.60
## 20 comps 21 comps 22 comps 23 comps 24 comps 25 comps
## X 94.39 95.04 95.65 96.23 96.73 97.20
## happiness_level 77.63 77.71 77.72 78.16 78.24 78.46
## 26 comps 27 comps 28 comps 29 comps 30 comps 31 comps
## X 97.59 97.93 98.26 98.54 98.78 99.02
## happiness_level 78.53 79.28 79.89 79.91 80.54 80.80
## 32 comps 33 comps 34 comps 35 comps 36 comps 37 comps
## X 99.21 99.38 99.53 99.66 99.75 99.83
## happiness_level 80.81 80.84 80.93 81.07 81.07 81.12
## 38 comps 39 comps 40 comps
## X 99.90 99.96 100.00
## happiness_level 83.24 83.38 83.65
# how many principal components should we use?
# RMSE as a function of the number of PC as computed using CV
validationplot(pcr_fit)
The minimum RMSE is achieved when we use 19 components.
# refit PCR using the number of components selected by cross-validation
pcr_fit <- pcr(happiness_level ~ . -province,
data = lifedata2015,
scale = TRUE,
ncomp = 19) # 19 components minimized the CV RMSE above
summary(pcr_fit)## Data: X dimension: 81 40
## Y dimension: 81 1
## Fit method: svdpc
## Number of components considered: 19
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 33.217 52.13 58.61 63.06 67.28 71.10 73.98
## happiness_level 2.339 26.62 27.94 44.79 55.72 56.07 57.85
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## X 76.73 79.16 81.31 83.35 85.13 86.65
## happiness_level 60.82 63.14 66.85 67.15 67.50 68.60
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## X 88.12 89.57 90.80 91.83 92.74 93.62
## happiness_level 69.02 69.44 71.35 71.63 77.56 77.60
The function kmeans() performs K-means clustering in R. We begin with a simple simulated example in which there truly are two clusters in the data: the first 25 observations have a mean shift relative to the next 25 observations.
# Simulate 50 two-dimensional observations containing two true clusters:
# observations 1-25 are shifted by (+3, -4) relative to observations 26-50.
set.seed(2)
x <- matrix(rnorm(50*2), ncol=2)
x[1:25, ] <- sweep(x[1:25, , drop = FALSE], 2, c(3, -4), `+`)
# Perform K-means clustering with 2 centers and 20 random starts.
km.out <- kmeans(x, centers = 2, nstart = 20)
# print cluster information
km.out$cluster## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2
# plot clusters
plot(x, col = (km.out$cluster+1),
main = "K-Means Clustering Results with K=2",
xlab = "",
ylab = "",
pch = 20,
cex = 2)
In this example, we knew that there really were two clusters because we generated the data. However, for real data, in general we do not know the true number of clusters. We could instead have performed K-means clustering on this example with K = 3.
# rerun K-means on the same simulated data, now with K = 3
set.seed(4)
km.out <- kmeans(x, centers = 3, nstart = 20)
km.out## K-means clustering with 3 clusters of sizes 17, 23, 10
##
## Cluster means:
## [,1] [,2]
## 1 3.7789567 -4.56200798
## 2 -0.3820397 -0.08740753
## 3 2.3001545 -2.69622023
##
## Clustering vector:
## [1] 1 3 1 3 1 1 1 3 1 3 1 3 1 3 1 3 1 1 1 1 1 3 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 3 2 3 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 25.74089 52.67700 19.56137
## (between_SS / total_SS = 79.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# plot the results with k=3
plot(x, col = (km.out$cluster+1),
main = "K-Means Clustering Results with K=3",
xlab = "",
ylab = "",
pch = 20,
cex = 2)
When K = 3, K-means clustering splits up the two clusters. It is strongly recommended to run K-means clustering with a large value of nstart, such as 20 or 50, since otherwise an undesirable local optimum may be obtained.
# compute and print within sum of squares
# compute and print within sum of squares
set.seed(3)
# a single random start may stop at a poor local optimum
km.out <- kmeans(x,3,nstart=1)
km.out$tot.withinss## [1] 97.97927
# 20 random starts: kmeans() keeps the run with the lowest tot.withinss
km.out <- kmeans(x,3,nstart=20)
# NOTE(review): both runs print the same value here; in general
# nstart=1 can yield a larger (worse) tot.withinss
km.out$tot.withinss## [1] 97.97927
# the data set was loaded previously
km.out <- kmeans(bodydata, centers = 2, nstart = 20)
plot(bodydata[,1:3], col=(km.out$cluster), cex=2)
Plot the clustering based on weight and height and compare it to the actual gender factor:
# Large blank circles are the resulting clusters
plot(bodydata[,22:23],col=(km.out$cluster),cex=2)
# put dots inside circles representing observed gender
# red = men, black = women
# (assumes body columns 23:24 are weight and height — TODO confirm)
points(body[,23:24], col=c(1,2)[body$gender], pch=19, cex=1)
# Map the gender factor to labels 1/2 for comparison with the clusters.
# (Fix: the original line fused the points() call and this assignment.)
observed_class <- c(1,2)[body$gender]
km_cluster <- km.out$cluster
# Agreement rate between K-means clusters and observed gender.
# NOTE(review): K-means labels are arbitrary; if the labels come out
# flipped on another run, 1 - ratio would be the agreement rate.
ratio <- sum(observed_class == km_cluster)/nrow(bodydata)
ratio## [1] 0.8402367
84% of observations seem to be clustered correctly. Note that if we know the clusters we do not need to apply K-Means clustering method. This was just for demonstration purposes. When we know the labels (like gender in the bodydata) there are better classification methods which we’ve discussed in our previous classes.
In this example we will use Turkish province data set and its subset.
# load packages
library(tidyverse) # tidy data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization # load data
load("../Data/lifedata2015.RData")
# make province names rownames
lifedata2015 <- column_to_rownames(lifedata2015, var = "province")# use only happiness level and some selected variables
happiness <- lifedata2015 %>%
select(happiness_level, life_exp, avr_daily_earnings, sat_rate_with_health_status)
head(happiness)## happiness_level life_exp avr_daily_earnings
## Adana 53.00 77.39335 59.06489
## Adiyaman 65.01 79.54820 53.24281
## Afyonkarahisar 76.43 76.99212 53.91157
## Agri 60.09 75.62830 56.10804
## Amasya 66.02 77.76714 53.77365
## Ankara 56.23 79.37352 70.14222
## sat_rate_with_health_status
## Adana 68.47
## Adiyaman 69.13
## Afyonkarahisar 80.07
## Agri 66.20
## Amasya 74.16
## Ankara 71.76
Display the correlation matrix of the happiness data:
library(ggcorrplot)
cormat <- round(cor(happiness), 2)
ggcorrplot(cormat, hc.order = TRUE, type = "lower", outline.color = "white")# visualize the distance function
# blue: more similar; red: dissimilar
# (look at the color legend, low value means more similar)
distance <- dist(scale(happiness))
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))# display few elements of distance matrix
as.matrix(distance)[1:6, 1:6]## Adana Adiyaman Afyonkarahisar Agri Amasya Ankara
## Adana 0.000000 2.768821 4.151958 2.060437 2.324195 2.686825
## Adiyaman 2.768821 0.000000 3.798892 3.916284 2.062164 2.889968
## Afyonkarahisar 4.151958 3.798892 0.000000 4.033355 2.057593 4.696319
## Agri 2.060437 3.916284 4.033355 0.000000 2.863795 4.407828
## Amasya 2.324195 2.062164 2.057593 2.863795 0.000000 3.253576
## Ankara 2.686825 2.889968 4.696319 4.407828 3.253576 0.000000
Apply K-means clustering:
# find cluster
happiness_scaled <- scale(happiness)
kmeans_happy <- kmeans(happiness_scaled, centers = 2, nstart=20)
# bivariate scatter plots
plot(happiness[,1:4],col=(kmeans_happy$cluster),cex=2)# cluster plot using principal components
# use factoextra::fviz_cluster() function to visualize clusters
# along with the first two principal components
fviz_cluster(kmeans_happy, data = happiness)# cluster plot using principal components
# without ellipses
fviz_cluster(kmeans_happy, geom = "point", ellipse = FALSE, data = happiness) +
ggtitle("K-means cluster of provinces with K=2")# PCA using FactoMineR
library(FactoMineR)
res.pca <- PCA(happiness, graph = FALSE)
get_eig(res.pca)## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.6875357 42.188394 42.18839
## Dim.2 1.0669411 26.673528 68.86192
## Dim.3 0.9111692 22.779229 91.64115
## Dim.4 0.3343540 8.358849 100.00000
# Scree plot
fviz_screeplot(res.pca, addlabels = TRUE, ylim = c(0, 50))# cluster plot using original data
happiness %>%
mutate(cluster = kmeans_happy$cluster,
province = row.names(happiness)) %>%
ggplot(aes(happiness_level, avr_daily_earnings, color = factor(cluster), label = province)) +
geom_text()# cluster plot using original data
happiness %>%
mutate(cluster = kmeans_happy$cluster,
province = row.names(happiness)) %>%
ggplot(aes(happiness_level, life_exp, color = factor(cluster), label = province)) +
geom_text()# cluster plot using original data
happiness %>%
mutate(cluster = kmeans_happy$cluster,
province = row.names(happiness)) %>%
ggplot(aes(happiness_level, sat_rate_with_health_status, color = factor(cluster), label = province)) +
geom_text()# compare and contrast different number of clusters
# plot different number of clusters
k2 <- kmeans(happiness, centers = 2, nstart = 25)
k3 <- kmeans(happiness, centers = 3, nstart = 25)
k4 <- kmeans(happiness, centers = 4, nstart = 25)
k5 <- kmeans(happiness, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = happiness) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = happiness) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = happiness) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = happiness) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)# determine the optimal number of clusters
set.seed(123)
# look for the elbow
fviz_nbclust(happiness, kmeans, method = "wss")# for optimal k=3 compute summary statistics
happiness %>%
mutate(Cluster = k3$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")## # A tibble: 3 x 5
## Cluster happiness_level life_exp avr_daily_earnings sat_rate_with_health_stat~
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 55.9 78.1 55.4 69.3
## 2 2 69.1 78.2 55.8 75.0
## 3 3 59.2 78.1 68.6 73.6
# visualize K-means results with 4 clusters
fviz_cluster(k4, data = happiness,
palette = c("#2E9FDF", "#28B463", "#E7B800", "#FC4E07"),
ellipse.type = "euclid", # Concentration ellipse
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting
ggtheme = theme_minimal()
)# summary results for k = 4 clusters
k4$tot.withinss # total sum of sq.## [1] 3438.896
k4$centers # centers (means) of clusters## happiness_level life_exp avr_daily_earnings sat_rate_with_health_status
## 1 70.49913 77.95681 56.46359 75.40783
## 2 50.89154 78.75987 56.77239 66.14000
## 3 59.40156 78.02649 54.49261 71.27375
## 4 59.18923 78.09380 68.62644 73.59692
k4$size # number of obs. in clusters## [1] 23 13 32 13
# for k=4 add cluster results to the data set
# and compute summary statistics
happiness_cl <- happiness %>%
mutate(cluster_kmeans = k4$cluster)
happiness_cl %>%
group_by(cluster_kmeans) %>%
summarise_all("mean")## # A tibble: 4 x 5
## cluster_kmeans happiness_level life_exp avr_daily_earnin~ sat_rate_with_healt~
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 70.5 78.0 56.5 75.4
## 2 2 50.9 78.8 56.8 66.1
## 3 3 59.4 78.0 54.5 71.3
## 4 4 59.2 78.1 68.6 73.6
The hclust() function implements hierarchical clustering in R.
# use previously created simulated data x (see kmeans clustering)
# clustering with different linkage functions;
# hclust() takes a dissimilarity matrix, hence dist(x)
hc.complete <- hclust(dist(x), method="complete")
hc.average <- hclust(dist(x), method="average")
hc.single <- hclust(dist(x), method="single")
hc.ward2 <- hclust(dist(x), method="ward.D2")
The option method="ward.D2" performs Ward’s minimum variance method. In this method the total within-cluster variance is minimized: the pair of clusters with minimum between-cluster distance is merged at each step. (This option corresponds to cluster::agnes(..., method = "ward") in the {cluster} package.)
Visualize them using base R plot function:
# plot them side-by-side using base R plot function
# cex = label size
labsize = 0.8
# 2 x 2 grid of plots
par(mfrow=c(2,2))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=labsize)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=labsize)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=labsize)
plot(hc.ward2, main="Ward Linkage", xlab="", sub="", cex=labsize)# cut dendrogram into groups
# k = 2 groups
cutree(hc.complete, k= 2)## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2
cutree(hc.average, 2)## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2
## [39] 2 2 2 2 2 1 2 1 2 2 2 2
cutree(hc.single, 2)## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1
# 4 clusters using single linkage
cutree(hc.single, 4)## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [39] 3 3 3 4 3 3 3 3 3 3 3 3
To scale the variables before performing hierarchical clustering of the observations, we use the scale() function:
# standardize the variables so no single variable dominates
# the Euclidean distances used by the clustering
xsc <- scale(x)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")# load packages
library(tidyverse) # data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization # load data
load("../Data/lifedata2015.RData")
# make province names rownames
lifedata2015 <- column_to_rownames(lifedata2015, var = "province")# hierarchical clustering
# using stats::hclust package
# scale variables before dist() function
# can also use pipes, %>%, see example at the end
# Hierarchical clustering of the scaled province data with four linkage
# methods. Compute the scaled distance matrix once and reuse it, instead
# of recomputing dist(scale(lifedata2015)) four times.
# (Fix: the original lines fused several statements together, which is
# not valid R.)
dist_life <- dist(scale(lifedata2015))
hc_complete <- hclust(dist_life, method="complete")
hc_average <- hclust(dist_life, method="average")
hc_single <- hclust(dist_life, method="single")
hc_ward <- hclust(dist_life, method="ward.D2")
# label size for the dendrogram plots
labsize = 0.7
plot(hc_complete, main = "Complete Linkage", xlab="", sub="", cex=labsize)
plot(hc_average, main = "Average Linkage", xlab="", sub="", cex=labsize)
plot(hc_single, main = "Single Linkage", xlab="", sub="", cex=labsize)
plot(hc_ward, main = "Ward's Method", xlab="", sub="", cex=labsize)
# use only happiness level and some selected variables
happiness <- lifedata2015 %>%
select(happiness_level, life_exp, avr_daily_earnings, sat_rate_with_health_status)# cluster using hclust
# Hierarchical clustering of the selected happiness variables.
# Compute the scaled distance matrix once and reuse it for all four
# linkage methods.
# (Fix: the original final line fused four statements together.)
dist_happy <- dist(scale(happiness))
hc_complete <- hclust(dist_happy, method="complete")
hc_average <- hclust(dist_happy, method="average")
hc_single <- hclust(dist_happy, method="single")
hc_ward <- hclust(dist_happy, method="ward.D2")
plot(hc_complete, main="Complete Linkage", xlab="", sub="", cex=labsize)
plot(hc_average, main="Average Linkage", xlab="", sub="", cex=labsize)
plot(hc_ward, main="Ward's method", xlab="", sub="", cex=labsize)
# form clusters: cut the dendrogram into 2 groups
ncluster = 2
cutree(hc_complete, ncluster)## Adana Adiyaman Afyonkarahisar Agri Amasya
## 1 1 2 2 1
## Ankara Antalya Artvin Aydin Balikesir
## 1 1 1 1 2
## Bilecik Bingöl Bitlis Bolu Burdur
## 1 1 1 2 1
## Bursa Çanakkale Çankiri Çorum Denizli
## 1 1 2 1 1
## Diyarbakir Edirne Elazig Erzincan Erzurum
## 1 1 1 1 1
## Eskisehir Gaziantep Giresun Gümüshane Hakkari
## 1 2 1 1 2
## Hatay Isparta Mersin Istanbul Izmir
## 1 2 1 1 1
## Kars Kastamonu Kayseri Kirklareli Kirsehir
## 1 1 1 1 1
## Kocaeli Konya Kütahya Malatya Manisa
## 1 2 2 1 1
## Kahramanmaras Mardin Mugla Mus Nevsehir
## 1 1 1 1 1
## Nigde Ordu Rize Sakarya Samsun
## 2 1 2 2 1
## Siirt Sinop Sivas Tekirdag Tokat
## 1 2 1 1 1
## Trabzon Tunceli Sanliurfa Usak Van
## 1 1 1 2 2
## Yozgat Zonguldak Aksaray Bayburt Karaman
## 1 1 1 2 2
## Kirikkale Batman Sirnak Bartin Ardahan
## 2 1 2 1 2
## Igdir Yalova Karabük Kilis Osmaniye
## 1 2 1 2 1
## Düzce
## 2
# plot dendrogram
plot(hc_complete)
# draw rectangles around clusters
rect.hclust(hc_complete , k = ncluster, border = 2:6) # colorize branches using dendextend library
library(dendextend)
# convert the hclust result to a dendrogram object
avg_dend_obj <- as.dendrogram(hc_complete)
# color the branches below height h = 3
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)# Cut tree into groups
hc_cluster2 <- cutree(hc_complete, k = 2)
# Number of members in each cluster
table(hc_cluster2)## hc_cluster2
## 1 2
## 57 24
# visualize result in a scatter plot using factoextra package
# 2 clusters
library(factoextra)
fviz_cluster(list(data = happiness, cluster = hc_cluster2))# we can add cluster information into data
happiness %>%
mutate(cluster = hc_cluster2) %>%
head## happiness_level life_exp avr_daily_earnings
## Adana 53.00 77.39335 59.06489
## Adiyaman 65.01 79.54820 53.24281
## Afyonkarahisar 76.43 76.99212 53.91157
## Agri 60.09 75.62830 56.10804
## Amasya 66.02 77.76714 53.77365
## Ankara 56.23 79.37352 70.14222
## sat_rate_with_health_status cluster
## Adana 68.47 1
## Adiyaman 69.13 1
## Afyonkarahisar 80.07 2
## Agri 66.20 2
## Amasya 74.16 1
## Ankara 71.76 1
# Cut tree into 3 groups
# use Ward method results
hc_cluster3 <- cutree(hc_ward, k = 3)
# Number of members in each cluster
table(hc_cluster3)## hc_cluster3
## 1 2 3
## 31 38 12
# visualize dendrogram
# use factoextra::hcut() function
hres3 <- hcut(happiness, k = 3, stand = TRUE)
# Visualize
fviz_dend(hres3, rect = TRUE, cex = 0.5,
k_colors = c("#F1C40F","#28B463", "#E67E22"))# for html color codes visit https://htmlcolorcodes.com/ # visualize result in a scatter plot using factoextra package
# 3 clusters
library(factoextra)
fviz_cluster(list(data = happiness, cluster = hc_cluster3),
palette = c("#F1C40F", "#E67E22", "#28B463"),
repel = TRUE # avoid overplotting text labels
)# visualize dendrogram
# use factoextra::hcut() function
# cut into 4 clusters
hres4 <- hcut(happiness, k = 4, stand = TRUE) # default is hc_method = "ward.D2"
# Visualize
fviz_dend(hres4, cex = 0.5,
k_colors = c("#F1C40F", "#28B463", "#E67E22", "#3498DB"),
color_labels_by_k = TRUE,
rect = TRUE,
rect_border = c("#F1C40F","#28B463", "#E67E22", "#3498DB"),
rect_fill = TRUE
)# horizontal dendrogram:
# add horiz = TRUE option
fviz_dend(hres4, cex = 0.5, horiz = TRUE,
k_colors = c("#F1C40F", "#28B463", "#E67E22", "#3498DB"),
color_labels_by_k = TRUE,
rect = TRUE,
rect_border = c("#F1C40F","#28B463", "#E67E22", "#3498DB"),
rect_fill = TRUE
)# circular dendrogram
# add type = "circular" option
fviz_dend(hres4, cex = 0.5, type = "circular",
k_colors = c("#F1C40F", "#28B463", "#E67E22", "#3498DB")
)# Alternatively dendrogram may be cut inside the fviz_dend
fviz_dend(hc_ward, # cluster results
k = 4, # desired number of clusters
rect = TRUE, # draw rectangles around clusters
cex = 0.5 # label size (province names)
)# 4 clusters using Ward's method
# Cut tree into 4 groups
hc_cluster_ward4 <- cutree(hc_ward, k = 4)
library(factoextra)
fviz_cluster(list(data = happiness, cluster = hc_cluster_ward4),
palette = c("#F1C40F", "#3498DB", "#E67E22", "#28B463"),
repel = TRUE)# add hierarchical clustering result to the data set
happiness_cl <- happiness_cl %>%
mutate(cluster_hc = hres4$cluster)
# cluster means
happiness_cl %>%
select(-cluster_kmeans) %>%
group_by(cluster_hc) %>%
summarize_all(mean)## # A tibble: 4 x 5
## cluster_hc happiness_level life_exp avr_daily_earnings sat_rate_with_health_s~
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 56.0 77.8 55.6 68.4
## 2 2 60.2 79.2 58.0 73.2
## 3 3 70.6 77.9 54.3 74.7
## 4 4 62.8 77.4 67.4 75.4
Illustration of using the pipe operator %>%:
# using pipes
# dend <- 1:10 %>% dist %>% hclust %>% as.dendrogram
# dend %>% plot()
# Build the dendrogram as one piped transformation chain.
dend <- happiness %>%               # take the data,
  scale() %>%                       # standardize it,
  dist() %>%                        # compute the distance matrix,
  hclust(method = "complete") %>%   # cluster hierarchically,
  as.dendrogram()                   # and convert to a dendrogram.
# draw the resulting dendrogram
plot(dend)